# Calculus
import pandas as pd
import os
# Data viz
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
columns_keep = [
#'division_number',
'date_reported',
'date_occurred',
#'area',
'area_name',
#'reporting_district',
#'part',
#'crime_code',
'crime_description',
#'modus_operandi',
'victim_age',
'victim_sex',
'victim_descent',
#'premise_code',
'premise_description',
#'weapon_code',
'weapon_description',
#'status',
'status_description',
#'crime_code_1',
#'crime_code_2',
#'crime_code_3',
#'crime_code_4',
'location',
#'cross_street',
'latitude',
'longitude',
]
# fetch all the data from the raw_data folder
current_dir = os.getcwd()
current_dir
file_path = os.path.join(current_dir, '..', 'raw_data', 'data.csv')
df = pd.read_csv(file_path)
df = df[columns_keep]
df['counter']=1
# Dates
df['date_occurred'] = pd.to_datetime(df['date_occurred'], errors='coerce')
df['year_occurred'] = df['date_occurred'].dt.year
df['month_occurred'] = df['date_occurred'].dt.month
df['hour_occurred'] = df['date_occurred'].dt.hour
df.head(3)
| date_reported | date_occurred | area_name | crime_description | victim_age | victim_sex | victim_descent | premise_description | weapon_description | status_description | location | latitude | longitude | counter | year_occurred | month_occurred | hour_occurred | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-08 | 2020-01-08 22:30:00 | Southwest | BATTERY - SIMPLE ASSAULT | 36 | F | B | SINGLE FAMILY DWELLING | STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) | Adult Other | 1100 W 39TH PL | 34.0141 | -118.2978 | 1 | 2020 | 1 | 22 |
| 1 | 2020-01-02 | 2020-01-01 03:30:00 | Central | BATTERY - SIMPLE ASSAULT | 25 | M | H | SIDEWALK | UNKNOWN WEAPON/OTHER WEAPON | Invest Cont | 700 S HILL ST | 34.0459 | -118.2545 | 1 | 2020 | 1 | 3 |
| 2 | 2020-04-14 | 2020-02-13 12:00:00 | Central | SEX OFFENDER REGISTRANT OUT OF COMPLIANCE | 0 | X | X | POLICE FACILITY | NaN | Adult Arrest | 200 E 6TH ST | 34.0448 | -118.2474 | 1 | 2020 | 2 | 12 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 852950 entries, 0 to 852949 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date_reported 852950 non-null object 1 date_occurred 852950 non-null datetime64[ns] 2 area_name 852950 non-null object 3 crime_description 852950 non-null object 4 victim_age 852950 non-null int64 5 victim_sex 740344 non-null object 6 victim_descent 740336 non-null object 7 premise_description 852432 non-null object 8 weapon_description 296748 non-null object 9 status_description 852950 non-null object 10 location 852950 non-null object 11 latitude 852950 non-null float64 12 longitude 852950 non-null float64 13 counter 852950 non-null int64 14 year_occurred 852950 non-null int64 15 month_occurred 852950 non-null int64 16 hour_occurred 852950 non-null int64 dtypes: datetime64[ns](1), float64(2), int64(5), object(9) memory usage: 110.6+ MB
null_counts = df.isnull().sum()
# Convert to DataFrame for better visualization
null_counts_df = pd.DataFrame(null_counts, columns=['Number of Nulls'])
null_counts_df = null_counts_df[null_counts_df['Number of Nulls']>0]
null_counts_df['Nulls_share'] = null_counts_df['Number of Nulls'] / len(df)
null_counts_df['Nulls_share'] =null_counts_df['Nulls_share'] .round(2)
null_counts_df =null_counts_df.sort_values(by='Nulls_share',ascending=False)
null_counts_df
| Number of Nulls | Nulls_share | |
|---|---|---|
| weapon_description | 556202 | 0.65 |
| victim_sex | 112606 | 0.13 |
| victim_descent | 112614 | 0.13 |
| premise_description | 518 | 0.00 |
df.describe()
| date_occurred | victim_age | latitude | longitude | counter | year_occurred | month_occurred | hour_occurred | |
|---|---|---|---|---|---|---|---|---|
| count | 852950 | 852950.000000 | 852950.000000 | 852950.000000 | 852950.0 | 852950.000000 | 852950.000000 | 852950.000000 |
| mean | 2022-01-09 05:17:37.460835840 | 29.742191 | 33.983232 | -118.040106 | 1.0 | 2021.532406 | 6.419488 | 13.185609 |
| min | 2020-01-01 00:01:00 | -3.000000 | 0.000000 | -118.667600 | 1.0 | 2020.000000 | 1.000000 | 0.000000 |
| 25% | 2021-01-26 15:00:00 | 5.000000 | 34.014100 | -118.429700 | 1.0 | 2021.000000 | 4.000000 | 9.000000 |
| 50% | 2022-01-30 12:42:00 | 31.000000 | 34.058500 | -118.321500 | 1.0 | 2022.000000 | 6.000000 | 14.000000 |
| 75% | 2022-12-25 15:00:00 | 45.000000 | 34.163200 | -118.273900 | 1.0 | 2022.000000 | 9.000000 | 19.000000 |
| max | 2023-12-04 23:00:00 | 120.000000 | 34.334300 | 0.000000 | 1.0 | 2023.000000 | 12.000000 | 23.000000 |
| std | NaN | 21.799470 | 1.756263 | 6.089068 | 0.0 | 1.099147 | 3.380271 | 6.535645 |
df.groupby('crime_description')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(5)
| crime_description | counter | |
|---|---|---|
| 132 | VEHICLE - STOLEN | 91473 |
| 4 | BATTERY - SIMPLE ASSAULT | 67976 |
| 115 | THEFT OF IDENTITY | 53467 |
| 21 | BURGLARY FROM VEHICLE | 52611 |
| 20 | BURGLARY | 51961 |
df.groupby('premise_description')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(10)
| premise_description | counter | |
|---|---|---|
| 266 | STREET | 216018 |
| 255 | SINGLE FAMILY DWELLING | 144367 |
| 207 | MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC) | 104461 |
| 227 | PARKING LOT | 59551 |
| 218 | OTHER BUSINESS | 40412 |
| 254 | SIDEWALK | 36700 |
| 298 | VEHICLE, PASSENGER/TRUCK | 25224 |
| 79 | GARAGE/CARPORT | 16673 |
| 63 | DRIVEWAY | 13838 |
| 246 | RESTAURANT/FAST FOOD | 10841 |
#weapon_description
df.groupby('weapon_description')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(10)
| weapon_description | counter | |
|---|---|---|
| 65 | STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) | 159021 |
| 75 | UNKNOWN WEAPON/OTHER WEAPON | 31728 |
| 78 | VERBAL THREAT | 21767 |
| 27 | HAND GUN | 18330 |
| 58 | SEMI-AUTOMATIC PISTOL | 6657 |
| 32 | KNIFE WITH BLADE 6INCHES OR LESS | 6262 |
| 73 | UNKNOWN FIREARM | 6011 |
| 44 | OTHER KNIFE | 5361 |
| 39 | MACE/PEPPER SPRAY | 3390 |
| 77 | VEHICLE | 2979 |
#weapon_description
df.groupby('status_description')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(10)
| status_description | counter | |
|---|---|---|
| 2 | Invest Cont | 683107 |
| 1 | Adult Other | 91576 |
| 0 | Adult Arrest | 74004 |
| 3 | Juv Arrest | 2775 |
| 4 | Juv Other | 1484 |
| 5 | UNK | 4 |
df.groupby('area_name')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(10)
| area_name | counter | |
|---|---|---|
| 1 | Central | 57691 |
| 0 | 77th Street | 53631 |
| 12 | Pacific | 49850 |
| 15 | Southwest | 47858 |
| 6 | Hollywood | 45018 |
| 14 | Southeast | 43357 |
| 11 | Olympic | 42979 |
| 9 | Newton | 42743 |
| 8 | N Hollywood | 42455 |
| 20 | Wilshire | 40648 |
df.groupby(['area_name','year_occurred'])['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(10)
| area_name | year_occurred | counter | |
|---|---|---|---|
| 6 | Central | 2022 | 17673 |
| 7 | Central | 2023 | 15273 |
| 2 | 77th Street | 2022 | 14532 |
| 62 | Southwest | 2022 | 13371 |
| 0 | 77th Street | 2020 | 13309 |
| 5 | Central | 2021 | 13156 |
| 1 | 77th Street | 2021 | 13074 |
| 50 | Pacific | 2022 | 13013 |
| 49 | Pacific | 2021 | 12917 |
| 3 | 77th Street | 2023 | 12716 |
df_evo = df.query(" year_occurred in (2021, 2023) ").pivot_table(index='area_name', columns='year_occurred',aggfunc='sum',values='counter').reset_index()
df_evo.columns=['area_name','2021','2023']
df_evo.head()
| area_name | 2021 | 2023 | |
|---|---|---|---|
| 0 | 77th Street | 13074 | 12716 |
| 1 | Central | 13156 | 15273 |
| 2 | Devonshire | 8487 | 8665 |
| 3 | Foothill | 7032 | 6418 |
| 4 | Harbor | 8858 | 8212 |
import pandas as pd
import plotly.express as px
# Assuming df_evo is your DataFrame and it's already defined
# Calculate the evolution from 2021 to 2023
df_evo['evol'] = df_evo['2023'].astype(float) / df_evo['2021'].astype(float) - 1
df_evo['evol'] = df_evo['evol'].round(2)
# Sort the DataFrame based on the evolution
df_evo = df_evo.sort_values(by='evol')
# Categorize the evolution into colors
df_evo['colors'] = df_evo['evol'].apply(lambda x:
'red' if x > 0.05
else ('green' if x < -0.05
else 'grey'))
# Define specific colors for each category
color_map = {
'green': '#32CD32', # Light green for positive evolution
'red': '#FF6347', # Tomato red for negative evolution
'grey': '#808080' # Grey for neutral
}
# Create a scatter plot with specified colors and area names
fig = px.scatter(df_evo, x="evol", y="2023", color='colors', title='Evolution of 2023 Compared to 2021',
color_discrete_map=color_map,
text='area_name') # Use 'area_name' instead of 'areaname'
# Set all markers to have grey borders and 70% opacity
fig.update_traces(marker=dict(size=12, line=dict(width=1, color='grey'), opacity=0.7))
# Improve layout for better readability
fig.update_traces(textposition='top center')
fig.update_layout(
showlegend=False,
plot_bgcolor='white', # Set plot background to white
paper_bgcolor='white' # Set overall figure background to white
)
# Show the plot
fig.show('notebook')
Central seems to be the most dangerous place, where crimes are also growing a lot.
fig = px.line(df.groupby(['area_name','year_occurred'])['counter'].count().reset_index().sort_values(by=['area_name','year_occurred'],ascending=False)
, x="year_occurred"
, y="counter"
,color = 'area_name'
, title='Crime evolution per place'
,markers=True
)
fig.show('notebook')
df.groupby('victim_age')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(5)
| victim_age | counter | |
|---|---|---|
| 3 | 0 | 211842 |
| 32 | 30 | 19421 |
| 37 | 35 | 19008 |
| 33 | 31 | 18603 |
| 31 | 29 | 18552 |
import plotly.express as px
# Assuming df is your DataFrame and it's already defined
fig = px.histogram(df, x="victim_age", title='Distribution of victims per age')
# Calculate the number of victims with age 0
num_victims_no_age = len(df[df['victim_age'] == 0])
# Add an annotation with an arrow
fig.add_annotation(
x=0, # Specifies the x-coordinate of the arrow's head
y=max(df['victim_age'].value_counts()), # Adjust this value as needed for y-coordinate
text=f"{num_victims_no_age} victims don't have an attributed age", # The text of the annotation
showarrow=True, # Show an arrow pointing to the specified coordinates
arrowhead=1 # The style of the arrowhead (1 is a simple arrow)
)
# Show the figure
fig.show('notebook')
# Assuming df is your DataFrame and it's already defined
fig = px.histogram( df.query(" victim_sex=='F' ")
, x="victim_age"
, color = 'victim_sex'
,title='Distribution of female victims per age')
# Show the figure
fig.show('notebook')
# Assuming df is your DataFrame and it's already defined
fig = px.histogram( df.query(" victim_sex=='M' ")
, x="victim_age"
, color = 'victim_sex'
,title='Distribution of males victims per age')
# Show the figure
fig.show('notebook')
df.groupby('victim_sex')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(5)
| victim_sex | counter | |
|---|---|---|
| 3 | M | 351362 |
| 1 | F | 313468 |
| 4 | X | 75420 |
| 2 | H | 93 |
| 0 | - | 1 |
df.groupby('victim_descent')['counter'].count().reset_index().sort_values(by='counter',ascending=False).head(5)
| victim_descent | counter | |
|---|---|---|
| 7 | H | 261145 |
| 17 | W | 173440 |
| 2 | B | 120896 |
| 18 | X | 83214 |
| 12 | O | 67532 |
df_victims = df.pivot_table( index='victim_descent',columns='victim_sex',aggfunc='sum',values='counter').reset_index()
df_victims = df_victims[['victim_descent','M','F']]
df_victims=df_victims.fillna(0)
df_victims['total']=df_victims['M']+df_victims['F']
df_victims = df_victims.sort_values(by='total',ascending=False)
df_victims['female_share'] = df_victims['F'] / df_victims['total']
df_victims['female_share']=df_victims['female_share'].round(2)
df_victims.head(5)
| victim_sex | victim_descent | M | F | total | female_share |
|---|---|---|---|---|---|
| 7 | H | 129647.0 | 131318.0 | 260965.0 | 0.50 |
| 17 | W | 100225.0 | 72995.0 | 173220.0 | 0.42 |
| 2 | B | 53081.0 | 67676.0 | 120757.0 | 0.56 |
| 12 | O | 43695.0 | 23741.0 | 67436.0 | 0.35 |
| 1 | A | 9657.0 | 9034.0 | 18691.0 | 0.48 |
df.head(1)
| date_reported | date_occurred | area_name | crime_description | victim_age | victim_sex | victim_descent | premise_description | weapon_description | status_description | location | latitude | longitude | counter | year_occurred | month_occurred | hour_occurred | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-08 | 2020-01-08 22:30:00 | Southwest | BATTERY - SIMPLE ASSAULT | 36 | F | B | SINGLE FAMILY DWELLING | STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) | Adult Other | 1100 W 39TH PL | 34.0141 | -118.2978 | 1 | 2020 | 1 | 22 |
fig = px.line(df.pivot_table(index='year_occurred' ,aggfunc='sum' , values='counter' ).reset_index()
, x="year_occurred"
, y="counter"
, title='Nb of crimes per year'
,markers=True
)
fig.show('notebook')
df_grouped = df.pivot_table(index=['month_occurred'] ,aggfunc='sum' , values='counter' ).reset_index()
df_grouped['counter_mean']=df_grouped['counter'] / len(df_grouped)
df_grouped
| month_occurred | counter | counter_mean | |
|---|---|---|---|
| 0 | 1 | 73129 | 6094.083333 |
| 1 | 2 | 68679 | 5723.250000 |
| 2 | 3 | 71196 | 5933.000000 |
| 3 | 4 | 70299 | 5858.250000 |
| 4 | 5 | 73349 | 6112.416667 |
| 5 | 6 | 72891 | 6074.250000 |
| 6 | 7 | 75429 | 6285.750000 |
| 7 | 8 | 75072 | 6256.000000 |
| 8 | 9 | 72141 | 6011.750000 |
| 9 | 10 | 75464 | 6288.666667 |
| 10 | 11 | 69960 | 5830.000000 |
| 11 | 12 | 55341 | 4611.750000 |
fig = px.line(df_grouped
, x="month_occurred"
, y="counter_mean"
#,color ='year_occurred'
,title='Crime Month Seasonality'
,markers=True
)
fig.show('notebook')
fig = px.line(df.pivot_table(index=['year_occurred','month_occurred'] ,aggfunc='sum' , values='counter' ).reset_index()
, x="month_occurred"
, y="counter"
,color ='year_occurred'
,title='Crime Month Seasonality'
,markers=True
)
fig.show('notebook')
import plotly.express as px
# Assuming df is your DataFrame and it's already defined
fig = px.line(df.pivot_table(index=['year_occurred', 'month_occurred'], aggfunc='sum', values='counter').reset_index(),
x="month_occurred",
y="counter",
color='year_occurred',
title='Crime Month Seasonality',
markers=True)
# Add an annotation with an arrow for December 2023
fig.add_annotation(
x=12, # x-coordinate for December
y=df[(df['year_occurred'] == 2023) & (df['month_occurred'] == 12)]['counter'].sum(), # y-coordinate for December 2023
text="Missing data for December 2023", # The annotation text
showarrow=True, # Show an arrow pointing to the specified coordinates
arrowhead=1, # The style of the arrowhead (1 is a simple arrow)
ax=-120, # Horizontal offset of the arrow (adjust as needed)
ay=-50 # Vertical offset of the arrow (adjust as needed)
)
# Show the figure
fig.show('notebook')
# Assuming you have a DataFrame 'df' with columns 'area_name', 'counter', and 'crime_description'
df_crimes = df.pivot_table(index='area_name', aggfunc='sum', values='counter', columns='crime_description').reset_index()
# Set 'area_name' as the index
df_crimes = df_crimes.set_index('area_name')
import pandas as pd
# Assuming df_crimes is your DataFrame
# df_crimes = pd.read_csv('your_data.csv') # Example to load data
# Calculate the correlation matrix
correlation_matrix = df_crimes.corr()
# Display the correlation matrix
correlation_matrix
| crime_description | ARSON | ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER | ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT | ATTEMPTED ROBBERY | BATTERY - SIMPLE ASSAULT | BATTERY ON A FIREFIGHTER | BATTERY POLICE (SIMPLE) | BATTERY WITH SEXUAL CONTACT | BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM | BIGAMY | ... | UNAUTHORIZED COMPUTER ACCESS | VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS) | VANDALISM - MISDEAMEANOR ($399 OR UNDER) | VEHICLE - ATTEMPT STOLEN | VEHICLE - STOLEN | VEHICLE, STOLEN - OTHER (MOTORIZED SCOOTERS, BIKES, ETC) | VIOLATION OF COURT ORDER | VIOLATION OF RESTRAINING ORDER | VIOLATION OF TEMPORARY RESTRAINING ORDER | WEAPONS POSSESSION/BOMBING |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| crime_description | |||||||||||||||||||||
| ARSON | 1.000000 | 0.640382 | 0.802754 | 0.877036 | 0.795099 | 0.414322 | 0.792373 | 0.416257 | 0.319015 | -0.408223 | ... | -0.575377 | 0.762680 | 0.785109 | 0.394897 | 0.627737 | 0.012203 | -0.448538 | -0.021869 | -0.582394 | 0.220270 |
| ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER | 0.640382 | 1.000000 | 0.558682 | 0.599608 | 0.674356 | 0.542911 | 0.737177 | 0.470088 | 0.203734 | -0.227827 | ... | -0.452161 | 0.585272 | 0.516159 | 0.114434 | 0.181087 | 0.079398 | -0.223461 | -0.031021 | -0.356584 | 0.038231 |
| ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT | 0.802754 | 0.558682 | 1.000000 | 0.921664 | 0.722539 | 0.267665 | 0.812042 | 0.191782 | 0.053449 | -0.492927 | ... | -0.614649 | 0.488878 | 0.875467 | 0.604097 | 0.764485 | 0.004481 | -0.276640 | 0.272391 | -0.298845 | 0.282560 |
| ATTEMPTED ROBBERY | 0.877036 | 0.599608 | 0.921664 | 1.000000 | 0.879820 | 0.461832 | 0.827545 | 0.440890 | 0.361852 | -0.538950 | ... | -0.534217 | 0.678861 | 0.807360 | 0.424522 | 0.608391 | 0.014359 | -0.383709 | 0.117981 | -0.351585 | 0.293896 |
| BATTERY - SIMPLE ASSAULT | 0.795099 | 0.674356 | 0.722539 | 0.879820 | 1.000000 | 0.666625 | 0.817322 | 0.749157 | 0.443872 | -0.418397 | ... | -0.397360 | 0.864910 | 0.695500 | 0.250183 | 0.315457 | 0.182661 | -0.527686 | -0.083608 | -0.382659 | 0.250179 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| VEHICLE, STOLEN - OTHER (MOTORIZED SCOOTERS, BIKES, ETC) | 0.012203 | 0.079398 | 0.004481 | 0.014359 | 0.182661 | 0.201154 | 0.125646 | 0.360140 | 0.509747 | 0.983821 | ... | -0.029351 | 0.331106 | 0.237177 | 0.437324 | 0.112618 | 1.000000 | -0.078017 | -0.093430 | -0.192871 | 0.273810 |
| VIOLATION OF COURT ORDER | -0.448538 | -0.223461 | -0.276640 | -0.383709 | -0.527686 | -0.322065 | -0.422450 | -0.534585 | 0.017299 | 0.308828 | ... | 0.036071 | -0.579071 | -0.269040 | 0.042368 | -0.014250 | -0.078017 | 1.000000 | 0.278040 | 0.416653 | -0.028942 |
| VIOLATION OF RESTRAINING ORDER | -0.021869 | -0.031021 | 0.272391 | 0.117981 | -0.083608 | -0.261454 | -0.058197 | -0.435435 | -0.625504 | -0.146899 | ... | -0.496845 | -0.287178 | 0.215279 | 0.412006 | 0.519665 | -0.093430 | 0.278040 | 1.000000 | -0.140528 | 0.325251 |
| VIOLATION OF TEMPORARY RESTRAINING ORDER | -0.582394 | -0.356584 | -0.298845 | -0.351585 | -0.382659 | -0.197052 | -0.416955 | -0.239865 | 0.363688 | -0.571571 | ... | 0.407601 | -0.587476 | -0.521707 | -0.235402 | -0.325704 | -0.192871 | 0.416653 | -0.140528 | 1.000000 | -0.349798 |
| WEAPONS POSSESSION/BOMBING | 0.220270 | 0.038231 | 0.282560 | 0.293896 | 0.250179 | 0.212912 | 0.356509 | 0.195228 | -0.722222 | 0.547723 | ... | -0.358772 | 0.308248 | 0.275263 | 0.256395 | 0.252524 | 0.273810 | -0.028942 | 0.325251 | -0.349798 | 1.000000 |
138 rows × 138 columns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Assuming you have a DataFrame 'df' with columns 'area_name', 'counter', and 'crime_description'
# Creating the df_crimes DataFrame
df_crimes = df.pivot_table(index='area_name', aggfunc='sum', values='counter', columns='crime_description')
df_crimes = df_crimes.fillna(0)
# Preparing the data for clustering
X = df_crimes.values # Extracting the numerical values for clustering
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Applying K-Means Clustering
k = 5 # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_scaled)
# Attaching the cluster labels to the original DataFrame
df_crimes['Cluster'] = kmeans.labels_
# Displaying the first few rows of the DataFrame with cluster labels
df_crimes.sort_values(by='Cluster')
C:\Users\antoi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
| crime_description | ARSON | ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER | ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT | ATTEMPTED ROBBERY | BATTERY - SIMPLE ASSAULT | BATTERY ON A FIREFIGHTER | BATTERY POLICE (SIMPLE) | BATTERY WITH SEXUAL CONTACT | BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM | BIGAMY | ... | VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS) | VANDALISM - MISDEAMEANOR ($399 OR UNDER) | VEHICLE - ATTEMPT STOLEN | VEHICLE - STOLEN | VEHICLE, STOLEN - OTHER (MOTORIZED SCOOTERS, BIKES, ETC) | VIOLATION OF COURT ORDER | VIOLATION OF RESTRAINING ORDER | VIOLATION OF TEMPORARY RESTRAINING ORDER | WEAPONS POSSESSION/BOMBING | Cluster |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| area_name | |||||||||||||||||||||
| Foothill | 86.0 | 52.0 | 1562.0 | 93.0 | 2173.0 | 6.0 | 60.0 | 126.0 | 1.0 | 0.0 | ... | 1705.0 | 789.0 | 71.0 | 3585.0 | 7.0 | 370.0 | 474.0 | 44.0 | 2.0 | 0 |
| Harbor | 84.0 | 23.0 | 2104.0 | 153.0 | 2815.0 | 9.0 | 74.0 | 111.0 | 0.0 | 0.0 | ... | 2294.0 | 1182.0 | 235.0 | 4885.0 | 26.0 | 353.0 | 1096.0 | 17.0 | 3.0 | 0 |
| Hollenbeck | 95.0 | 59.0 | 2203.0 | 156.0 | 2885.0 | 10.0 | 80.0 | 116.0 | 0.0 | 0.0 | ... | 2092.0 | 1119.0 | 197.0 | 5029.0 | 3.0 | 368.0 | 694.0 | 45.0 | 1.0 | 0 |
| Mission | 92.0 | 35.0 | 1798.0 | 149.0 | 2297.0 | 6.0 | 78.0 | 177.0 | 0.0 | 0.0 | ... | 1960.0 | 801.0 | 108.0 | 4662.0 | 2.0 | 338.0 | 925.0 | 35.0 | 0.0 | 0 |
| Newton | 172.0 | 49.0 | 3503.0 | 339.0 | 3904.0 | 6.0 | 97.0 | 165.0 | 0.0 | 1.0 | ... | 2571.0 | 1159.0 | 181.0 | 6693.0 | 5.0 | 208.0 | 831.0 | 38.0 | 3.0 | 0 |
| Southwest | 104.0 | 77.0 | 3197.0 | 272.0 | 4065.0 | 7.0 | 127.0 | 218.0 | 0.0 | 0.0 | ... | 2787.0 | 1415.0 | 255.0 | 5204.0 | 546.0 | 331.0 | 743.0 | 45.0 | 3.0 | 0 |
| Rampart | 125.0 | 37.0 | 2883.0 | 329.0 | 3833.0 | 19.0 | 93.0 | 195.0 | 0.0 | 1.0 | ... | 2437.0 | 1128.0 | 179.0 | 4366.0 | 58.0 | 253.0 | 438.0 | 36.0 | 2.0 | 0 |
| Central | 199.0 | 151.0 | 3755.0 | 436.0 | 6276.0 | 41.0 | 300.0 | 369.0 | 2.0 | 1.0 | ... | 4029.0 | 1389.0 | 115.0 | 3755.0 | 118.0 | 133.0 | 260.0 | 9.0 | 2.0 | 1 |
| 77th Street | 183.0 | 71.0 | 5334.0 | 447.0 | 4195.0 | 7.0 | 258.0 | 158.0 | 0.0 | 0.0 | ... | 2893.0 | 1602.0 | 238.0 | 7046.0 | 3.0 | 218.0 | 589.0 | 16.0 | 2.0 | 2 |
| Southeast | 149.0 | 82.0 | 4399.0 | 321.0 | 3500.0 | 10.0 | 202.0 | 116.0 | 1.0 | 0.0 | ... | 2255.0 | 1592.0 | 282.0 | 5825.0 | 1.0 | 264.0 | 690.0 | 36.0 | 0.0 | 2 |
| West LA | 73.0 | 5.0 | 722.0 | 94.0 | 2648.0 | 7.0 | 37.0 | 198.0 | 0.0 | 0.0 | ... | 2485.0 | 923.0 | 48.0 | 2831.0 | 162.0 | 163.0 | 297.0 | 17.0 | 1.0 | 3 |
| Van Nuys | 87.0 | 12.0 | 1467.0 | 107.0 | 2569.0 | 3.0 | 70.0 | 162.0 | 1.0 | 0.0 | ... | 2309.0 | 932.0 | 84.0 | 3636.0 | 32.0 | 343.0 | 385.0 | 33.0 | 0.0 | 3 |
| Topanga | 45.0 | 29.0 | 1417.0 | 112.0 | 2496.0 | 13.0 | 55.0 | 159.0 | 0.0 | 0.0 | ... | 1803.0 | 676.0 | 84.0 | 2704.0 | 16.0 | 215.0 | 652.0 | 89.0 | 1.0 | 3 |
| Northeast | 119.0 | 30.0 | 1541.0 | 185.0 | 2419.0 | 10.0 | 84.0 | 114.0 | 0.0 | 0.0 | ... | 2471.0 | 1048.0 | 140.0 | 4337.0 | 25.0 | 384.0 | 537.0 | 24.0 | 3.0 | 3 |
| West Valley | 75.0 | 28.0 | 1596.0 | 136.0 | 2457.0 | 14.0 | 64.0 | 179.0 | 2.0 | 0.0 | ... | 2112.0 | 819.0 | 117.0 | 3393.0 | 19.0 | 463.0 | 173.0 | 130.0 | 1.0 | 3 |
| N Hollywood | 127.0 | 32.0 | 1763.0 | 141.0 | 3036.0 | 5.0 | 85.0 | 190.0 | 0.0 | 1.0 | ... | 2816.0 | 1061.0 | 141.0 | 3960.0 | 48.0 | 87.0 | 102.0 | 26.0 | 1.0 | 3 |
| Devonshire | 47.0 | 13.0 | 1192.0 | 101.0 | 2361.0 | 4.0 | 56.0 | 147.0 | 0.0 | 0.0 | ... | 1716.0 | 671.0 | 77.0 | 3151.0 | 34.0 | 315.0 | 339.0 | 131.0 | 0.0 | 3 |
| Wilshire | 119.0 | 118.0 | 1550.0 | 160.0 | 2898.0 | 7.0 | 100.0 | 174.0 | 0.0 | 0.0 | ... | 2551.0 | 945.0 | 54.0 | 3326.0 | 47.0 | 307.0 | 354.0 | 11.0 | 1.0 | 3 |
| Olympic | 133.0 | 25.0 | 2470.0 | 280.0 | 4053.0 | 6.0 | 84.0 | 205.0 | 2.0 | 1.0 | ... | 2671.0 | 1186.0 | 146.0 | 4652.0 | 46.0 | 293.0 | 447.0 | 42.0 | 1.0 | 4 |
| Hollywood | 116.0 | 54.0 | 2540.0 | 270.0 | 4102.0 | 19.0 | 189.0 | 270.0 | 1.0 | 0.0 | ... | 3052.0 | 1022.0 | 62.0 | 3211.0 | 53.0 | 194.0 | 396.0 | 21.0 | 5.0 | 4 |
| Pacific | 119.0 | 32.0 | 1880.0 | 154.0 | 2994.0 | 20.0 | 139.0 | 216.0 | 0.0 | 2.0 | ... | 2817.0 | 1131.0 | 296.0 | 5222.0 | 544.0 | 255.0 | 327.0 | 10.0 | 3.0 | 4 |
21 rows × 139 columns
df_evo_2 = df_evo.merge(df_crimes,on='area_name')
df_evo_2.head(1)
| area_name | 2021 | 2023 | evol | colors | ARSON | ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER | ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT | ATTEMPTED ROBBERY | BATTERY - SIMPLE ASSAULT | ... | VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS) | VANDALISM - MISDEAMEANOR ($399 OR UNDER) | VEHICLE - ATTEMPT STOLEN | VEHICLE - STOLEN | VEHICLE, STOLEN - OTHER (MOTORIZED SCOOTERS, BIKES, ETC) | VIOLATION OF COURT ORDER | VIOLATION OF RESTRAINING ORDER | VIOLATION OF TEMPORARY RESTRAINING ORDER | WEAPONS POSSESSION/BOMBING | Cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Hollywood | 12084 | 10274 | -0.15 | green | 116.0 | 54.0 | 2540.0 | 270.0 | 4102.0 | ... | 3052.0 | 1022.0 | 62.0 | 3211.0 | 53.0 | 194.0 | 396.0 | 21.0 | 5.0 | 4 |
1 rows × 144 columns
df_evo_2['Cluster']=df_evo_2['Cluster'].astype(str)
fig = px.scatter(df_evo_2
, x="evol", y="2023", color="Cluster",
size='2023', hover_data=['2023'])
fig.show('notebook')
import pandas as pd
import plotly.express as px
# Ensure df_evo_2 is correctly set up with 'evol', '2023', 'Cluster', and 'area_name' columns
# Convert Cluster to a string for color coding
df_evo_2['Cluster'] = df_evo_2['Cluster'].astype(str)
# Create the scatter plot
fig = px.scatter(df_evo_2, x="evol", y="2023", color="Cluster",
size='2023', hover_data=['area_name', '2023'],
text='area_name') # Add area name as text labels on the plot
# Customize the layout
fig.update_traces(textposition='top center')
fig.update_layout(
title="Evolution of 2023 Compared to 2021 by Cluster",
xaxis_title="Evolution from 2021 to 2023",
yaxis_title="Value in 2023",
legend_title="Cluster",
plot_bgcolor='white',
paper_bgcolor='white'
)
# Customize hover template for better readability
fig.update_traces(hovertemplate="<br>".join([
"Area: %{hoverdata[0]}",
"2023 Value: %{hoverdata[1]}",
"Evolution: %{x}",
"Cluster: %{marker.color}"
]))
# Show the plot
fig.show('notebook')